library(data.table)
library(DataCombine)
library(foreign)
library(fracdiff)
library(parallel)
library(portes)
library(urca)
library(dplyr)
library(tseries)
# remove (almost) everything in the working environment, such as local macro variables.
remove(list = ls())

set.seed(987654321)

# define a that estimates our models and saves results. We call it "model". It's a function of the variables, n.
model <- function(n){ #

n <- 54
t <- seq(1:n) #generate a sequence of {0,1,2,...,n}

# Set values for bounds
#See Grant and Lebo Table I.1 and Grant and Lebo replication code
#bounds of mood, 49 to 71
a <- 0.5
b <- 0.5
tau <- 59
k <- 5.49998

e<-rnorm(n,0,2) #generate a random variable e which has a normal distribution with mean=0; variance=2

# Generating starting values for DV, bound b/t 1 and 100
dv1 <- c(1:n,NA) #generate a vector of 1 by n (dv)
dv1[1] <- runif(1, min=50, max=59)
    for(i in 2:n){ #for values 2 through n of dv1, replace with bounded unit root
  		dv1[i] <- dv1[i-1] + exp(-k)*(exp((-a)*(dv1[i-1] - tau)) - (exp((b)*(dv1[i-1] - tau)))) + e[i]
    }

#Grant and Lebo estimate d=1.44 for policy liberalism and d=.83 for inequality

    #generate fractionally integrated series
    #because fracdiff.sim only generated -.5<d<.5, generate (d-1) and then add 1
    d <- fracdiff.sim(n, d = .44) #
    dpl <- as.ts(d$series) #save fractionally integrated series y_frac as a time series
    xpl <- c(1:n,NA) #generate a vector of 1 by n with all missing values (NA)
    xpl[1]<-dpl[1] #replace the first element of xpl with the first element of d44
    for(i in 2:n){ #for values 2 through n of xpl, replace with lagged value + FI series to produce d=1.44
  		xpl[i] <- xpl[i - 1] + dpl[i]
    }

    d <- fracdiff.sim(n, d = -.17) #
    din <- as.ts(d$series) #save fractionally integrated series y_frac as a time series
    xin <- c(1:n,NA) #generate a vector of 1 by n (y_d62) with all missing values (NA)
    xin[1]<-din[1] #replace the first element of xin with the first element of din
    for(i in 2:n){ #for values 2 through n of xpl, replace with lagged value + FI series to produce d=.83
  		xin[i] <- xin[i - 1] + din[i]
    }

  dv1 <- dv1[1:n]
  xpl <- xpl[1:n]
  xin <- xin[1:n]
    data <- data.frame(cbind(t,dv1,xpl,xin)) #combine all time series that we have generated so far.


    write.dta(data, file = "data.dta") #export data in the DTA format

    data$lxpl <- lag(data$xpl) #generate lagged value
    data$lxin <- lag(data$xin) #generate lagged value
    data$ldv1 <- lag(data$dv1) #generate lagged value

    data$d_xpl <- data$xpl - data$lxpl #generate first-differenced
    data$d_xin <- data$xin - data$lxin #generate first-differenced
    data$d_dv1 <- data$dv1 - data$ldv1 #generate first-differenced

    #test for a unit root in dv1
    mlag <- as.integer(12*(n/100)^(1/4)) #Use Schwert's (1987) criterion for the max n of lag length
    adf_test <- ur.df(data$dv1, type="drift", lags=mlag, selectlag="AIC")
    adf_dv_s <- summary(adf_test)
    df_dv_tstat <- adf_dv_s@teststat[1,1]
    df_dv_cv <- adf_dv_s@cval[1,2]

    #estimate the GECM
    ecm <- lm(d_dv1~ldv1 + d_xin + lxin + d_xpl + lxpl, data=data)
    beta_ldv1<- summary(ecm)$coefficients[2,1] #save coefficient for ldv1
    beta_d_xin<- summary(ecm)$coefficients[3,1] #save coefficient for d_xin
    beta_lxin<- summary(ecm)$coefficients[4,1]
    beta_d_xpl<- summary(ecm)$coefficients[5,1]
    beta_lxpl<- summary(ecm)$coefficients[6,1]

    se_ldv1<- summary(ecm)$coefficients[2,2] #save std. error for the coefficient
    se_d_xin<- summary(ecm)$coefficients[3,2] #save std. error for the coefficient
    se_lxin<- summary(ecm)$coefficients[4,2]
    se_d_xpl<- summary(ecm)$coefficients[5,2]
    se_lxpl<- summary(ecm)$coefficients[6,2]
    tstat_ldv1<-beta_ldv1/se_ldv1 #save t statistic of the coefficient
    tstat_d_xin<-beta_d_xin/se_d_xin
    tstat_lxin<-beta_lxin/se_lxin
    tstat_d_xpl<-beta_d_xpl/se_d_xpl
    tstat_lxpl<-beta_lxpl/se_lxpl


#save all the results and parameters in results file.
    results<-cbind(n,tstat_ldv1,tstat_lxin,tstat_lxpl,df_dv_tstat,df_dv_cv)
    return(results)
}

#Run simulations
n_list<-c(54) #define the set of values n takes (n=number of T for each series).
nobs <- length(n_list) #save the length of n_list, which is 1 as we only have one element in n_list.
numparams<- 7 #set the number of columns in the output to which results are saved for each iteration
reps <- 1000 #set the number of iteration
d <- data.frame(matrix(, nrow = nobs*reps, ncol = numparams)) #d is the matrix that we save our results to.
i <- 0 #define i so that we can use it to save results to d[i,] for each iteration
output = NULL
for(n in n_list) {
            for(i3 in 1:reps) {
                i <- i + 1
                d[i,] <- cbind(i3,model(n)) #for each iteration i3, we save the results computed from model(n).
            }
        }
colnames(d) = c("iterations","n","tstat_ldv1","tstat_lxin","tstat_lxpl","df_dv_stat","df_dv_cv")
write.dta(d, file = "KellyEnns_sim_outputs.dta")
attach(d)
#Cointegration w/ EC test if evidence that DV contains a unit root
100*sum(tstat_ldv1 < -3.570 & df_dv_stat<df_dv_cv)/reps
#spurious regression on "inequality" variable
100*sum((tstat_lxin < -1.96 & tstat_ldv1 < -3.570) | (tstat_lxin > 1.96 & tstat_ldv1 < -3.570))/reps
